* CFI Model ICD10
* Versioned in Stata V16 based upon original SAS code provided by Dae Hyun Kim
* Authored by: Doug Bedell at Westat

* This code reads in relevant CFI datasets from provided code base,
* and outputs a CFI Measure for each patient.

* NOTE: Implemented using standard Stata without extensions. For large data sets
* this may not be the most efficient method. Consider replacing the full
* pair-wise CROSS call with a ranged lookup such as RANGEJOIN from SSC.
* RANGEJOIN is optional and is not used by this script as written.
* To try it, install it first with this line:
* ssc install rangejoin

* Set input/output path for files
cd C:/CFI-Stata

* Convert every delimited input file to a .dta file so later steps can
* -use-, -joinby-, -cross- and -append- them by name.

* Read in all study IDs (patid)
import delimited "ids.txt"
save "ids.dta", replace
clear

* Read in ICD-9 DX data (patid, dx)
* Column 2 (dx) is forced to string so all-numeric codes keep leading zeros.
import delimited "dx09.txt", stringcols(2)
save "dx09.dta", replace
clear

* Read in ICD-9 DX lookup data
* Column 2 is forced to string for the same reason as the DX data.
import delimited "CFI_ICD9CM_V32.csv", stringcols(2)
drop description icd_ver
save "dx9lookup.dta", replace
clear

* Read in ICD-10 DX data (patid, dx)
* Column 2 (dx) is forced to string, consistent with the ICD-9 DX import.
import delimited "dx10.txt", stringcols(2)
save "dx10.dta", replace
clear

* Read in ICD-10 DX lookup data
import delimited "CFI_ICD10CM_V2020.csv"
drop description icd_ver
save "dx10lookup.dta", replace
clear

* Read in PX data (patid, px)
* Column 2 (px) must be a string: the procedure step applies strlen() and
* substr() to it. Without stringcols(2) a file holding only numeric codes
* would be imported as numeric, losing leading zeros and failing later.
import delimited "px.txt", stringcols(2)
save "px.dta", replace
clear

* Read in PX lookup data. Implements PX section of SAS codes_org.
import delimited "pxlookup_stata.txt"
save "pxlookup_stata.dta", replace
clear

* Read in model disease weights lookup
* asdouble stores the decimal weights in double precision instead of the
* default float, so the summed score is not built from float-rounded weights.
import delimited "disease_weight.txt", asdouble
save "disease_weight.dta", replace
clear

* NOTE: Each DX/PX only needs to be scored once per patient, so exact
* duplicate rows are removed before any lookup is performed.
* Translate ICD-9 diagnosis codes into disease numbers.
local dxfile   "dx09.dta"
local dxlookup "dx9lookup.dta"
use "`dxfile'", clear
duplicates drop
* Form every (diagnosis row, lookup row) pair that shares the same dx code
joinby dx using "`dxlookup'"
* The raw code is no longer needed once the disease number is attached
drop dx
sort patid disease_number
* Overwrite the diagnosis file with the looked-up result
save "`dxfile'", replace
clear

* Translate ICD-10 diagnosis codes into disease numbers.
local dxfile   "dx10.dta"
local dxlookup "dx10lookup.dta"
use "`dxfile'", clear
* Exact duplicate rows are removed before the lookup
duplicates drop
* Form every (diagnosis row, lookup row) pair that shares the same dx code
joinby dx using "`dxlookup'"
* The raw code is no longer needed once the disease number is attached
drop dx
sort patid disease_number
* Overwrite the diagnosis file with the looked-up result
save "`dxfile'", replace
clear

* Get disease numbers for procedures file
use px
duplicates drop
* Remove any row whose procedure code is not exactly 5 characters long or
* whose 5th character is a letter. The test looks only at px, so it is applied
* BEFORE the pair-wise cross: the result is the same, but far fewer
* (procedure, lookup range) pairs have to be built.
drop if strlen(px) !=5 | uisletter(substr(px,5,1))
* Pair every remaining procedure row with every lookup range ...
cross using pxlookup_stata
* ... and keep only the pairs where the code lies within [start, stop]
keep if inrange(px,start,stop)
drop px start stop
sort patid disease_number
save "px.dta", replace
clear

* Give every study ID a placeholder row with disease_number = 0.
* Because of this row, a PatID that never appears in the DX9, DX10 or PX
* files is still present when the data are combined, and so ends up with the
* default score (ModelIntercept).
local idfile "ids.dta"
use "`idfile'", clear
* One placeholder row per distinct ID row
duplicates drop
generate disease_number = 0
save "`idfile'", replace
clear

* Stack the ICD-9, ICD-10, procedure and placeholder-ID rows into one data set.
use "dx09.dta", clear
* A single -append- call adds the remaining files in the order listed
append using "dx10.dta" "px.dta" "ids.dta"

* A disease reached through several codes must be weighted only once per
* patient, so collapse repeated rows.
duplicates drop
sort patid disease_number
save "diseasedatasort.dta", replace
clear

* Attach the model weight to every patient/disease row.
local diseasefile "diseasedatasort.dta"
local weightfile  "disease_weight.dta"
use "`diseasefile'", clear
duplicates drop
* unmatched(master) keeps rows whose disease_number has no entry in the
* weight table; the _merge marker that joinby adds is not needed afterwards.
joinby disease_number using "`weightfile'", unmatched(master)
drop _merge
* Rows without a weight get 0 so that patients without DX or PX stay in the
* data and still receive a score.
replace weight = 0 if missing(weight)
sort patid disease_number
save "`diseasefile'", replace
clear

* Frailty score = sum of a patient's weights + ModelIntercept.
* A patient with no DX/PX has a weight sum of 0 and so scores ModelIntercept.
use "diseasedatasort.dta", clear
* collapse leaves one row per patid holding only patid and the summed weight;
* all other variables (disease_number) are discarded by it.
collapse (sum) weight, by(patid)
* ModelIntercept comes from the trained model
local intercept 0.10288
* Must be stored as double, not float, or the rounded decimals come out wrong
generate double frailty_index = round(weight + `intercept', 0.00001)
keep patid frailty_index
save "scores.dta", replace

* Write to CSV
* -use- has no replace option (it stops with a syntax error); clear is the
* option that discards the data in memory before loading the file.
use scores, clear
* replace here belongs to -export delimited- and overwrites an existing CSV
export delimited "frailty_output_stata.csv", replace
